In [22]:
import logging
import numpy as np
import pandas as pd
# Send log output to the console.
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
import datetime
%matplotlib inline
from shapely.prepared import prep
from shapely import speedups
speedups.enable()  # speed up repeated Shapely geometry operations
In [ ]:
# Columns of interest from the raw occurrence chunks.
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude', 'day', 'month', 'year']
result_with_lat_long = pd.DataFrame(columns=important_columns1)
counter = 0
# Read the merged occurrence data in chunks to keep memory usage low.
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if counter % 100 == 0:
        print("%s Processing.. %s " % (datetime.datetime.now().time().isoformat(), counter))
    # Keep only chunks that contain coordinates, and only the columns of interest.
    if "decimallatitude" in df.columns.tolist() and "decimallongitude" in df.columns.tolist():
        common_columns = list(set(important_columns1).intersection(set(df.columns.tolist())))
        result_with_lat_long = result_with_lat_long.append(df[common_columns], ignore_index=True)
In [ ]:
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
In [ ]:
result_with_lat_long['species'].unique().size
It is best to take into account all observations that have either "year" or "eventdate" present (or both). Let's group them by species name and count the number of observation records.
In [ ]:
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby(['species']).apply(lambda x: x['species'].count())
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
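A note on the grouping above: the same per-species counts can also be obtained with value_counts, which avoids the explicit groupby/apply (a small sketch using the same filter as above; the intermediate name "mask" is just illustrative, and value_counts returns the counts sorted in descending order).
In [ ]:
# Equivalent per-species observation counts via value_counts.
mask = result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()
result_with_lat_long[mask]['species'].value_counts().head(10)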
In [ ]:
result_with_lat_long['species'].unique().size
In [ ]:
# Keep only observations from after 1990, using either the numeric 'year' or the 'eventdate' string.
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude']][(result_with_lat_long.year>1990) | (result_with_lat_long.eventdate>"1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1990.shape[0]
In [ ]:
year_or_eventdate_1990.basisofrecord.unique()
We should keep only records whose basisofrecord is 'OBSERVATION', 'MACHINE_OBSERVATION', or 'HUMAN_OBSERVATION'.
In [ ]:
final_selection = year_or_eventdate_1990[year_or_eventdate_1990.basisofrecord.isin(['OBSERVATION', 'HUMAN_OBSERVATION', 'MACHINE_OBSERVATION'])]
In [ ]:
final_selection.species.unique().shape
In [ ]:
final_selection
In [ ]:
from iSDM.species import GBIFSpecies
In [ ]:
all_species = GBIFSpecies(name_species='All')
In [ ]:
all_species.set_data(final_selection)
In [ ]:
all_species.get_data().species.unique().shape # these many different species
In [ ]:
all_species.get_data().shape[0] # 1939675 — the number of observations satisfying our criteria (after 1990, with the correct observation types)
In [ ]:
year_or_eventdate_1990.shape[0] # number of observations before filtering on observation type
In [ ]:
all_species.geometrize()
In [ ]:
all_species.get_data().species.unique().shape
In [ ]:
final_observations = all_species.get_data()[['species', 'year','eventdate', 'basisofrecord','geometry']]
In [ ]:
# final_observations.to_file("../data/bias_grid/final_observations", driver="ESRI Shapefile")
In [17]:
from geopandas import GeoDataFrame
final_observations = GeoDataFrame.from_file("../data/bias_grid/final_observations/")
In [18]:
final_observations.head()
Out[18]:
30 arcsec = 0.5/60 degrees ≈ 0.0083333333 degrees per pixel, which corresponds to 'height': 21600, 'width': 43200 for a global map.
Generate a 2D array and use it as the basis for the bias grid.
In [19]:
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.083333333 # 5 arcmin resolution; use 0.0083333333 for 30 arcsec
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
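As a quick sanity check (a minimal sketch, assuming the 5-arcmin pixel size chosen above), the expected global grid dimensions can be verified directly:
In [ ]:
# At 5 arcmin the global grid should be 4320 x 2160 pixels;
# at 30 arcsec it would be 43200 x 21600 instead.
assert (x_res, y_res) == (4320, 2160)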
In [20]:
x_res
Out[20]:
In [21]:
y_res
Out[21]:
We will use a memory-mapped bias-grid file to keep RAM usage from exploding, at least while the results are computed.
In [23]:
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='w+', shape=(y_res,x_res))
In [24]:
def increase_pixel(row):
    # Convert the point's latitude/longitude to (row, column) indices in the global grid
    # and increment the observation count for that pixel.
    bias_grid_mm[np.abs(int((row.y - 90) / pixel_size)),
                 np.abs(int((row.x + 180) / pixel_size))] += 1
In [25]:
here = final_observations.geometry.apply(lambda row: increase_pixel(row)) # keeps the memory usage stable
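To illustrate how this mapping works, here is a small sketch with a hypothetical sample point (the row/column formulas are copied from increase_pixel above): the latitude is measured as a pixel offset from the northern edge and the longitude from the western edge.
In [ ]:
from shapely.geometry import Point
sample = Point(4.90, 52.37)  # hypothetical observation: (longitude, latitude)
row_idx = np.abs(int((sample.y - 90) / pixel_size))   # offset from the northern edge, in pixels
col_idx = np.abs(int((sample.x + 180) / pixel_size))  # offset from the western edge, in pixels
print(row_idx, col_idx)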
In [26]:
bias_grid_mm.max()
Out[26]:
In [13]:
bias_grid_mm.std() # this still eats memory (it uses intermediate data structures, of course)
Out[13]:
In [27]:
bias_grid_mm.sum()
Out[27]:
In [28]:
# check if the number of all observations is equal to the bias_grid sum of observations
bias_grid_mm.sum() == final_observations.shape[0]
Out[28]:
In [29]:
bias_grid_mm.flush()
In [19]:
del bias_grid_mm
In [ ]:
# To read the memory-mapped grid back later, the following is used:
In [16]:
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm_5arcmin.dat", dtype='int32', mode='r', shape=(y_res,x_res))
In [5]:
bias_grid_mm.sum()
Out[5]:
In [6]:
import gc
gc.collect()
Out[6]:
In [30]:
import rasterio
from rasterio.transform import Affine
# Map pixel (column, row) indices to (longitude, latitude): origin at the top-left corner, north-up.
transform = Affine.translation(x_min, y_max) * Affine.scale(pixel_size, -pixel_size)
crs = {'init': "EPSG:4326"}  # WGS84
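As a quick check (a sketch, not part of the original pipeline), the affine transform should map the top-left pixel corner to (x_min, y_max) and the bottom-right corner to approximately (x_max, y_min):
In [ ]:
# The transform maps (column, row) pixel coordinates to (longitude, latitude).
print(transform * (0, 0))          # expected: (-180.0, 90.0), the top-left corner
print(transform * (x_res, y_res))  # expected: approximately (180.0, -90.0)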
In [31]:
transform
Out[31]:
In [32]:
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif", 'w', driver='GTiff', width=x_res, height=y_res,
                   count=1,
                   dtype=np.uint32,
                   nodata=0,
                   transform=transform,
                   crs=crs) as out:
    out.write(bias_grid_mm.astype(np.uint32), indexes=1)
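To make sure the GeoTIFF on disk matches the in-memory bias grid, it can be read back and compared (a verification sketch; the path is the one used above):
In [ ]:
# Read the raster back and compare its total count and shape with the memory-mapped grid.
with rasterio.open("../data/bias_grid/bias_grid_5arcmin.tif") as src:
    written = src.read(1)
print(written.sum() == bias_grid_mm.sum())
print(written.shape)  # expected: (y_res, x_res)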
In [33]:
bias_grid_mm.shape
Out[33]: